####################################
# Title - Event-Based Framing of Democracy in American News Media
# Date - May 28th, 2024
# Goal - Juncture analysis -  ratio
####################################

# Remove every object from the current (global) environment before the run.
# NOTE(review): this wipes the caller's workspace when the script is sourced
# and does not detach packages or reset options; consider running in a fresh
# R session instead.
rm(list=ls())

# Library
library(quanteda)
library(dplyr)
library(text2vec)
library(conText)
library(ggplot2)
library(lubridate)
library(tidyr)

# Dataset
# Cleaned article data; it is turned into the quanteda corpus below.
data <- readRDS("data/nyt_cleaned_articles.rds")
# Pre-trained GloVe embeddings (per the original note, trained by Stanford
# researchers), passed as `pre_trained` to the conText functions.
glove_vectors <- readRDS("data/glove.rds")
# Transformation matrix passed as `transform_matrix` to the conText functions.
transform_vectors <- readRDS("data/khodakA.rds")
# Previously saved result object; not referenced in the visible part of
# this script.
result_dem_mean_month <- readRDS("data/dictionary_demo2_cosine_mean_month.rds")

# Window half-widths, in months, taken before and after each juncture date.
time_intervals <- c(1, 3, 6, 9, 12)
#########################
#
# 1. Preprocessing
#
#########################

# Factor copies of the publication month and year; the year factor is built
# before `pub_year` itself is converted to numeric on the next line.
data[["pub_month_factor"]] <- as.factor(data[["pub_month"]])
data[["pub_year_factor"]] <- as.factor(data[["pub_year"]])
data[["pub_year"]] <- as.numeric(data[["pub_year"]])
# Constant column (value 1 for every row).
data[["global"]] <- 1

# Build the corpus from the article data frame: `docid` supplies the document
# ids and `clean_text` the text.
# NOTE(review): quanteda documents `meta` as a named list of corpus-level
# metadata; passing TRUE here looks unintended -- confirm.
text_corpus <- corpus(
  data,
  docid_field = "docid",
  text_field = "clean_text",
  meta = TRUE
)

# Tokenize, dropping punctuation, symbols, URLs and separators; numbers are
# kept.
toks <- tokens(
  text_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = FALSE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Drop English stopwords and tokens shorter than 3 characters.
toks_nostop <- tokens_select(
  toks,
  pattern = stopwords("en"),
  selection = "remove",
  min_nchar = 3
)

# Keep only (lower-cased) features that occur at least 5 times in the corpus.
feats <- featnames(
  dfm_trim(
    dfm(toks_nostop, tolower = TRUE, verbose = FALSE),
    min_termfreq = 5
  )
)

# Optional spell check, run only when hunspell is installed. Features are
# upper-cased before checking so that names are not flagged as misspelled.
if (requireNamespace("hunspell", quietly = TRUE)) {
  library(hunspell) # spell check library
  spellcheck <- hunspell_check(
    toupper(feats),
    dict = hunspell::dictionary("en_US")
  )
  feats <- feats[spellcheck]
}

# Restrict tokens to the retained features, leaving pads in place of removed
# tokens so that non-adjacent words do not become adjacent.
toks_nostop_feats <- tokens_select(toks_nostop, feats, padding = TRUE)


# Build a tokenized corpus of the contexts (6 tokens on each side) around the
# target terms. Matching is fixed and case-sensitive rather than regex-based,
# so only the exact spellings listed here are targets (e.g. a regex for
# "democracy" would also have matched "undemocracy").
demo_pattern <- c(
  "democracy", "Democracy", "DEMOCRACY",
  "democracies", "Democracies", "DEMOCRACIES",
  "democracy's", "Democracy's", "DEMOCRACY's",
  "democratic"
)
demo_toks <- tokens_context(
  x = toks_nostop_feats,
  pattern = demo_pattern,
  window = 6L,
  valuetype = "fixed",
  case_insensitive = FALSE,
  hard_cut = FALSE,
  rm_keyword = FALSE,
  verbose = TRUE
)

# The intermediate objects are no longer needed once the contexts exist.
rm(text_corpus, toks, toks_nostop, feats)

# Document-feature matrix of the context windows.
demo_dfm <- dfm(demo_toks)

# Document-embedding matrix built from the pre-trained GloVe vectors
# (300-dimensional per the original note) and the transformation matrix.
demo_dem <- dem(
  x = demo_dfm,
  pre_trained = glove_vectors,
  transform = TRUE,
  transform_matrix = transform_vectors,
  verbose = TRUE
)

##############################
#
# 2. Cosine similarity for each dictionary
#
##############################

# Term dictionaries, one character vector per dictionary.
elec_dict <- c(
  "multiparty", "one-vote", "enfranchisement", "suffrage",
  "voter", "voters", "election", "voting", "vote", "votes", "ballot"
)
lib_dict <- c(
  "pluralism", "freedom", "freedoms", "liberty", "liberties", "rights",
  "individuality", "constitutional", "constitutionalism", "constitutions"
)
parti_dict <- c(
  "activism", "grassroots", "grass-roots", "movement", "demonstrations",
  "participation", "rallies", "protest", "protests", "plebiscite"
)
delib_dict <- c(
  "deliberative", "consensus-building", "consensus",
  "dialogue", "thoughtful", "deliberation"
)
egal_dict <- c(
  "egalitarian", "egalitarianism", "equality", "unequal", "inequality",
  "inequalities", "entitlement", "welfare", "disparities", "disparity",
  "equal"
)
autho_dict <- c(
  "meritocratic", "elites", "elitist", "centralized", "top-down", "charisma"
)

# All dictionary terms concatenated in the order the dictionaries are
# defined above.
all_dict <- unlist(
  list(elec_dict, lib_dict, parti_dict, delib_dict, egal_dict, autho_dict),
  use.names = FALSE
)

##################################
#
# 3. Define function
#
##################################
#' Label each document as before / after a juncture date
#'
#' Adds a document variable named `juncture_name` to `toks`, derived from the
#' `pub_date` document variable:
#'   * "Pre-juncture" when `date2 <= pub_date < date1`
#'   * "Pro-juncture" when `date1 <= pub_date <= date3`
#'   * "out_window"   otherwise
#' Documents whose `pub_date` is NA get an NA label.
#'
#' The labels are written straight to the `juncture_name` docvar, so an
#' existing docvar called `target_value` is left untouched and calling the
#' function again with the same `juncture_name` overwrites the previous
#' labels instead of failing on a duplicate column name.
#'
#' @param toks Object with document variables (accessed through `docvars()`)
#'   that include `pub_date`.
#' @param juncture_name Single string: name of the docvar to create.
#' @param date1 Juncture date (anything `as.Date()` accepts).
#' @param date2 Start of the pre-juncture window (inclusive).
#' @param date3 End of the post-juncture window (inclusive).
#' @return `toks` with the additional (or replaced) docvar.
set_juncture <- function(toks, juncture_name, date1, date2, date3) {
  stopifnot(is.character(juncture_name), length(juncture_name) == 1L)

  juncture_date <- as.Date(date1)
  window_start <- as.Date(date2)
  window_end <- as.Date(date3)
  pub_date <- docvars(toks, "pub_date")

  is_pre <- pub_date < juncture_date & pub_date >= window_start
  is_post <- pub_date >= juncture_date & pub_date <= window_end

  # Nested ifelse() is kept on purpose: it propagates NA for documents with a
  # missing publication date instead of silently calling them "out_window".
  labels <- ifelse(
    is_pre, "Pre-juncture",
    ifelse(is_post, "Pro-juncture", "out_window")
  )

  docvars(toks, juncture_name) <- labels
  toks
}
#' Nearest-neighbour cosine-similarity ratio between juncture groups
#'
#' Thin wrapper around `conText::get_nns_ratio()` with the settings used in
#' this analysis: N = 30, "Pro-juncture" as the numerator group, 100
#' bootstraps and 100 permutations, no stemming.
#'
#' The grouping variable is read from `toks` itself. The original code read
#' it from the global `demo_toks_sub`, which only gave the right groups when
#' the caller happened to pass that same object.
#'
#' Relies on the globals `glove_vectors` and `transform_vectors`.
#'
#' @param toks Tokenized contexts carrying the `juncture_name` docvar.
#' @param juncture_name Single string: name of the grouping docvar.
#' @param candidates Character vector of candidate features. Defaults to the
#'   global `local_vocab`, which is what the original implementation used.
#' @return The result of `get_nns_ratio()`.
ratio_cos <- function(toks, juncture_name, candidates = local_vocab) {
  get_nns_ratio(
    x = toks,
    N = 30,
    groups = docvars(toks, juncture_name),
    numerator = "Pro-juncture",
    candidates = candidates,
    pre_trained = glove_vectors,
    transform = TRUE,
    transform_matrix = transform_vectors,
    bootstrap = TRUE,
    num_bootstraps = 100,
    permute = TRUE,
    num_permutations = 100,
    stem = FALSE,
    verbose = FALSE
  )
}

#########################
#
# Critical juncture - Trump
#
######################### 
  #Trump - announced June 16, 2015/ primary - Feb 1 2016 / elected November 8, 2016
  
  ##Primary 
  # Name of the docvar that holds the pre/post labels for this juncture.
  juncture <- "junc_trump_primary"

  for (i in time_intervals) {
    # Symmetric window: i months before and i months after the primary date.
    start_date <- as.Date("2016-02-01")
    start_month <- start_date %m-% months(i)
    end_month <- start_date %m+% months(i)

    # Label every context, then keep only those inside the window. The column
    # is looked up through `juncture` so its name is defined in one place.
    demo_toks <- set_juncture(demo_toks, juncture, start_date, start_month, end_month)
    demo_toks_sub <- tokens_subset(
      demo_toks,
      docvars(demo_toks, juncture) %in% c("Pre-juncture", "Pro-juncture")
    )
    local_vocab <- get_local_vocab(demo_toks_sub, pre_trained = glove_vectors)

    # Cosine-similarity ratio; the seed is reset for every window so the
    # bootstrap and permutation draws are reproducible.
    set.seed(2021L)
    juncture_dem_ratio_juncture <- ratio_cos(demo_toks_sub, juncture)
    juncture_dem_ratio_juncture$window <- i

    # Visualization: point estimate with confidence interval per feature and
    # a reference line at a ratio of 1.
    ratio_plot <- ggplot(
      juncture_dem_ratio_juncture,
      aes(x = value, y = feature, xmin = lower.ci, xmax = upper.ci)
    ) +
      geom_point(size = 1.5) +
      geom_errorbarh(height = 0.1) +
      xlab("Cosine similarity ratio (Pre vs Pro-juncture)") +
      ylab(paste0(i, "-months")) +
      geom_vline(xintercept = 1, color = "red") +
      theme_bw() +
      theme(axis.text.y = element_text(size = 15))

    # Save the plot explicitly instead of relying on last_plot(), and make
    # sure the target directory exists so the save cannot fail after the
    # expensive bootstrap has already run.
    fig_path <- paste0("fig/juncture/trump_election main/", juncture, "_", i, "_ratio.jpeg")
    dir.create(dirname(fig_path), recursive = TRUE, showWarnings = FALSE)
    ggsave(fig_path, plot = ratio_plot, width = 12, height = 8, dpi = 1000)

    # Drop the label column again so the next window starts from clean
    # docvars. all_of() makes explicit that `juncture` holds a column name.
    docvars(demo_toks) <- docvars(demo_toks) %>% select(!all_of(juncture))
  }
  
  ##Announcement
    # Name of the docvar that holds the pre/post labels for this juncture.
    juncture <- "junc_trump_announce"

    for (i in time_intervals) {
      # Symmetric window: i months before and after the announcement date.
      start_date <- as.Date("2015-06-16")
      start_month <- start_date %m-% months(i)
      end_month <- start_date %m+% months(i)

      # Label every context, then keep only those inside the window. The
      # column is looked up through `juncture` so its name is defined in one
      # place.
      demo_toks <- set_juncture(demo_toks, juncture, start_date, start_month, end_month)
      demo_toks_sub <- tokens_subset(
        demo_toks,
        docvars(demo_toks, juncture) %in% c("Pre-juncture", "Pro-juncture")
      )
      local_vocab <- get_local_vocab(demo_toks_sub, pre_trained = glove_vectors)

      # Cosine-similarity ratio; the seed is reset for every window so the
      # bootstrap and permutation draws are reproducible.
      set.seed(2021L)
      juncture_dem_ratio_juncture <- ratio_cos(demo_toks_sub, juncture)
      juncture_dem_ratio_juncture$window <- i

      # Visualization: point estimate with confidence interval per feature
      # and a reference line at a ratio of 1.
      ratio_plot <- ggplot(
        juncture_dem_ratio_juncture,
        aes(x = value, y = feature, xmin = lower.ci, xmax = upper.ci)
      ) +
        geom_point(size = 1.5) +
        geom_errorbarh(height = 0.1) +
        xlab("Cosine similarity ratio (Pre vs Pro-juncture)") +
        ylab(paste0(i, "-months")) +
        geom_vline(xintercept = 1, color = "red") +
        theme_bw() +
        theme(axis.text.y = element_text(size = 15))

      # Save the plot explicitly instead of relying on last_plot(), and make
      # sure the target directory exists so the save cannot fail after the
      # expensive bootstrap has already run.
      fig_path <- paste0("fig/juncture/trump_election main/", juncture, "_", i, "_ratio.jpeg")
      dir.create(dirname(fig_path), recursive = TRUE, showWarnings = FALSE)
      ggsave(fig_path, plot = ratio_plot, width = 12, height = 8, dpi = 1000)

      # Drop the label column again so the next window starts from clean
      # docvars. all_of() makes explicit that `juncture` holds a column name.
      docvars(demo_toks) <- docvars(demo_toks) %>% select(!all_of(juncture))
    }
    
  ## Elected
    # Name of the docvar that holds the pre/post labels for this juncture.
    juncture <- "junc_trump_election"

    for (i in time_intervals) {
      # Symmetric window: i months before and after election day.
      start_date <- as.Date("2016-11-08")
      start_month <- start_date %m-% months(i)
      end_month <- start_date %m+% months(i)

      # Label every context, then keep only those inside the window. The
      # column is looked up through `juncture` so its name is defined in one
      # place.
      demo_toks <- set_juncture(demo_toks, juncture, start_date, start_month, end_month)
      demo_toks_sub <- tokens_subset(
        demo_toks,
        docvars(demo_toks, juncture) %in% c("Pre-juncture", "Pro-juncture")
      )
      local_vocab <- get_local_vocab(demo_toks_sub, pre_trained = glove_vectors)

      # Cosine-similarity ratio; the seed is reset for every window so the
      # bootstrap and permutation draws are reproducible.
      set.seed(2021L)
      juncture_dem_ratio_juncture <- ratio_cos(demo_toks_sub, juncture)
      juncture_dem_ratio_juncture$window <- i

      # Visualization: point estimate with confidence interval per feature
      # and a reference line at a ratio of 1.
      ratio_plot <- ggplot(
        juncture_dem_ratio_juncture,
        aes(x = value, y = feature, xmin = lower.ci, xmax = upper.ci)
      ) +
        geom_point(size = 1.5) +
        geom_errorbarh(height = 0.1) +
        xlab("Cosine similarity ratio (Pre vs Pro-juncture)") +
        ylab(paste0(i, "-months")) +
        geom_vline(xintercept = 1, color = "red") +
        theme_bw() +
        theme(axis.text.y = element_text(size = 15))

      # Save the plot explicitly instead of relying on last_plot(), and make
      # sure the target directory exists so the save cannot fail after the
      # expensive bootstrap has already run.
      fig_path <- paste0("fig/juncture/trump_election main/", juncture, "_", i, "_ratio.jpeg")
      dir.create(dirname(fig_path), recursive = TRUE, showWarnings = FALSE)
      ggsave(fig_path, plot = ratio_plot, width = 12, height = 8, dpi = 1000)

      # Drop the label column again so the next window starts from clean
      # docvars. all_of() makes explicit that `juncture` holds a column name.
      docvars(demo_toks) <- docvars(demo_toks) %>% select(!all_of(juncture))
    }